bitkeeper revision 1.1159.51.2 (412cb2dfaIDYjySJYYMTByGbcM77UA)
author kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>
Wed, 25 Aug 2004 15:40:15 +0000 (15:40 +0000)
committer kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>
Wed, 25 Aug 2004 15:40:15 +0000 (15:40 +0000)
More grant-table code, and some related sundry improvements.

13 files changed:
xen/arch/x86/domain.c
xen/arch/x86/memory.c
xen/arch/x86/setup.c
xen/common/domain.c
xen/common/grant_table.c
xen/common/kernel.c
xen/common/page_alloc.c
xen/include/asm-x86/atomic.h
xen/include/asm-x86/mm.h
xen/include/asm-x86/smp.h
xen/include/asm-x86/system.h
xen/include/xen/grant_table.h
xen/include/xen/sched.h

index 87aa127781869915e20482e09e63b80d2cde46c0..8739506b73b3e864edf92a01e1b22a75cd51e27e 100644 (file)
@@ -668,9 +668,9 @@ int construct_dom0(struct domain *p,
           mfn++ )
     {
         page = &frame_table[mfn];
-        page->u.inuse.domain        = p;
+        page->u.inuse.domain     = p;
         page->u.inuse.type_info  = 0;
-        page->u.inuse.count_info = PGC_allocated | 1;
+        page->u.inuse.count_info = PGC_always_set | PGC_allocated | 1;
         list_add_tail(&page->list, &p->page_list);
         p->tot_pages++; p->max_pages++;
     }
index 9f8bca4603282ce241c0bb679b375f2bc8220466..52c9dcca8d9e81a9d7aaddf9b65727a837378de0 100644 (file)
@@ -153,6 +153,9 @@ void arch_init_memory(void)
     vm_assist_info[VMASST_TYPE_writable_pagetables].disable =
         ptwr_disable;
 
+    for ( mfn = 0; mfn < max_page; mfn++ )
+        frame_table[mfn].u.inuse.count_info |= PGC_always_set;
+
     /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
     memset(machine_to_phys_mapping, 0x55, 4<<20);
 
@@ -179,9 +182,9 @@ void arch_init_memory(void)
           mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
           mfn++ )
     {
-        frame_table[mfn].u.inuse.count_info = 1 | PGC_allocated;
-        frame_table[mfn].u.inuse.type_info  = 1 | PGT_gdt_page; /* non-RW */
-        frame_table[mfn].u.inuse.domain     = dom_xen;
+        frame_table[mfn].u.inuse.count_info |= PGC_allocated | 1;
+        frame_table[mfn].u.inuse.type_info   = PGT_gdt_page | 1; /* non-RW */
+        frame_table[mfn].u.inuse.domain      = dom_xen;
     }
 }
 
@@ -370,6 +373,7 @@ get_page_from_l1e(
 {
     unsigned long l1v = l1_pgentry_val(l1e);
     unsigned long pfn = l1_pgentry_to_pagenr(l1e);
+    struct pfn_info *page = &frame_table[pfn];
     extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
 
     if ( !(l1v & _PAGE_PRESENT) )
@@ -383,6 +387,8 @@ get_page_from_l1e(
 
     if ( unlikely(!pfn_is_ram(pfn)) )
     {
+        /* SPECIAL CASE 1. Mapping an I/O page. */
+
         /* Revert to caller privileges if FD == DOMID_IO. */
         if ( d == dom_io )
             d = current;
@@ -397,17 +403,41 @@ get_page_from_l1e(
         return 0;
     }
 
+    if ( unlikely(!get_page_from_pagenr(pfn, d)) )
+    {
+        /* SPECIAL CASE 2. Mapping a foreign page via a grant table. */
+        
+        int rc;
+        struct domain *e;
+        u32 count_info;
+        /*
+         * Yuk! Amazingly this is the simplest way to get a guaranteed atomic
+         * snapshot of a 64-bit value on IA32. x86/64 solves this of course!
+         * Basically it's a no-op CMPXCHG, to get us the current contents.
+         * No need for LOCK prefix -- we know that count_info is never zero
+         * because it contains PGC_always_set.
+         */
+        __asm__ __volatile__(
+            "cmpxchg8b %2"
+            : "=a" (e), "=d" (count_info),
+              "=m" (*(volatile u64 *)(&page->u.inuse.domain))
+            : "0" (0), "1" (0), "b" (0), "c" (0) );
+        if ( unlikely((count_info & PGC_count_mask) == 0) ||
+             unlikely(e == NULL) || unlikely(!get_domain(e)) )
+             return 0;
+        rc = gnttab_try_map(e, d, page, l1v & _PAGE_RW);
+        put_domain(e);
+        return rc;
+    }
+
     if ( l1v & _PAGE_RW )
     {
-        if ( unlikely(!get_page_and_type_from_pagenr(
-            pfn, PGT_writable_page, d)) )
+        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
             return 0;
-        set_bit(_PGC_tlb_flush_on_type_change, 
-                &frame_table[pfn].u.inuse.count_info);
-        return 1;
+        set_bit(_PGC_tlb_flush_on_type_change, &page->u.inuse.count_info);
     }
 
-    return get_page_from_pagenr(pfn, d);
+    return 1;
 }
 
 
@@ -434,14 +464,33 @@ get_page_from_l2e(
 }
 
 
-static void put_page_from_l1e(l1_pgentry_t l1e)
+static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
 {
     struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
     unsigned long    l1v  = l1_pgentry_val(l1e);
+    struct domain   *e = page->u.inuse.domain;
 
     if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) )
         return;
 
+    if ( unlikely(e != d) )
+    {
+        /*
+         * Unmap a foreign page that may have been mapped via a grant table.
+         * Note that this can fail for a privileged domain that can map foreign
+         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
+         * counted via a grant entry and some counted directly in the page
+         * structure's reference count. Note that reference counts won't get
+         * dangerously confused as long as we always try to decrement the
+         * grant entry first. We may end up with a mismatch between which
+         * mappings and which unmappings are counted via the grant entry, but
+         * really it doesn't matter as privileged domains have carte blanche.
+         */
+        if ( likely(gnttab_try_unmap(e, d, page, l1v & _PAGE_RW)) )
+            return;
+        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
+    }
+
     if ( l1v & _PAGE_RW )
     {
         put_page_and_type(page);
@@ -452,7 +501,7 @@ static void put_page_from_l1e(l1_pgentry_t l1e)
         if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
                        PGT_ldt_page)) &&
              unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
-            invalidate_shadow_ldt(page->u.inuse.domain);
+            invalidate_shadow_ldt(e);
         put_page(page);
     }
 }
@@ -527,7 +576,7 @@ static int alloc_l1_table(struct pfn_info *page)
 
  fail:
     while ( i-- > 0 )
-        put_page_from_l1e(pl1e[i]);
+        put_page_from_l1e(pl1e[i], d);
 
     unmap_domain_mem(pl1e);
     return 0;
@@ -551,6 +600,7 @@ static void free_l2_table(struct pfn_info *page)
 
 static void free_l1_table(struct pfn_info *page)
 {
+    struct domain *d = page->u.inuse.domain;
     unsigned long page_nr = page - frame_table;
     l1_pgentry_t *pl1e;
     int i;
@@ -558,7 +608,7 @@ static void free_l1_table(struct pfn_info *page)
     pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
 
     for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-        put_page_from_l1e(pl1e[i]);
+        put_page_from_l1e(pl1e[i], d);
 
     unmap_domain_mem(pl1e);
 }
@@ -651,6 +701,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
 {
     l1_pgentry_t ol1e;
     unsigned long _ol1e;
+    struct domain *d = current;
 
     if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
     {
@@ -671,18 +722,18 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
         
         if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
         {
-            put_page_from_l1e(nl1e);
+            put_page_from_l1e(nl1e, d);
             return 0;
         }
         
-        put_page_from_l1e(ol1e);
+        put_page_from_l1e(ol1e, d);
         return 1;
     }
 
     if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
         return 0;
     
-    put_page_from_l1e(ol1e);
+    put_page_from_l1e(ol1e, d);
     return 1;
 }
 
@@ -1289,20 +1340,10 @@ int do_update_va_mapping_otherdomain(unsigned long page_nr,
 }
 
 
-static inline int readonly_page_from_l1e(l1_pgentry_t l1e)
-{
-    struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
-    unsigned long    l1v  = l1_pgentry_val(l1e);
-
-    if ( (l1v & _PAGE_RW) || !(l1v & _PAGE_PRESENT) ||
-         !pfn_is_ram(l1v >> PAGE_SHIFT) )
-        return 0;
-    put_page_type(page);
-    return 1;
-}
-
 
-/* Writable Pagetables */
+/*************************
+ * Writable Pagetables
+ */
 
 ptwr_info_t ptwr_info[NR_CPUS] =
     { [ 0 ... NR_CPUS-1 ] =
@@ -1365,13 +1406,8 @@ void ptwr_reconnect_disconnected(unsigned long addr)
         nl1e = pl1e[i];
         if (likely(l1_pgentry_val(nl1e) == l1_pgentry_val(ol1e)))
             continue;
-        if (likely((l1_pgentry_val(nl1e) ^ l1_pgentry_val(ol1e)) ==
-                   _PAGE_RW)) {
-            if (likely(readonly_page_from_l1e(nl1e)))
-                continue;
-        }
         if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
-            put_page_from_l1e(ol1e);
+            put_page_from_l1e(ol1e, current);
         if (unlikely(!get_page_from_l1e(nl1e, current)))
             BUG();
     }
@@ -1438,7 +1474,7 @@ void ptwr_flush_inactive(void)
             if (likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)))
                 continue;
             if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
-                put_page_from_l1e(ol1e);
+                put_page_from_l1e(ol1e, current);
             if (unlikely(!get_page_from_l1e(nl1e, current)))
                 BUG();
         }
index 3d18ebd4ee00f6100fb3980536f05a7fb8d0e21e..975f8a4724bb40fb1791e77e72eef8482f1fe528 100644 (file)
@@ -411,7 +411,7 @@ void __init start_of_day(void)
     clear_bit(smp_processor_id(), &wait_init_idle);
     smp_threads_ready = 1;
     smp_commence(); /* Tell other CPUs that state of the world is stable. */
-    while (wait_init_idle) 
+    while ( wait_init_idle != 0 )
     {
         cpu_relax();
         barrier();
index 768238103223c94ad9371d45f39553794f1de740..55621847d029096a0f42cb123eab0a6bb88ae039 100644 (file)
@@ -232,12 +232,16 @@ void domain_destruct(struct domain *d)
 {
     struct domain **pd;
     unsigned long flags;
+    atomic_t      old, new;
 
     if ( !test_bit(DF_DYING, &d->flags) )
         BUG();
 
     /* May be already destructed, or get_domain() can race us. */
-    if ( cmpxchg(&d->refcnt.counter, 0, DOMAIN_DESTRUCTED) != 0 )
+    _atomic_set(old, 0);
+    _atomic_set(new, DOMAIN_DESTRUCTED);
+    old = atomic_compareandswap(old, new, &d->refcnt);
+    if ( _atomic_read(old) != 0 )
         return;
 
     DPRINTK("Releasing task %u\n", d->domain);
index 27f81b9e229f3c4c613d61fff69d3ca979f80ce0..f76b18f8a86e6a4a88bd92ac9cfb4af0de254667 100644 (file)
 #include <xen/config.h>
 #include <xen/sched.h>
 
+#define PIN_FAIL(_rc, _f, _a...)   \
+    do {                           \
+        DPRINTK( _f, ## _a );      \
+        rc = -(_rc);               \
+        goto out;                  \
+    } while ( 0 )
+
 static inline void
 check_tlb_flush(
     active_grant_entry_t *a)
@@ -70,6 +77,7 @@ gnttab_update_pin_status(
     active_grant_entry_t *act;
     grant_entry_t *sha;
     long           rc = 0;
+    unsigned long  frame;
 
     ld = current;
 
@@ -93,8 +101,11 @@ gnttab_update_pin_status(
         return -EINVAL;
     }
 
-    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) )
+    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
+         unlikely(ld == rd) )
     {
+        if ( rd != NULL )
+            put_domain(rd);
         DPRINTK("Could not find domain %d\n", dom);
         return -ESRCH;
     }
@@ -102,6 +113,8 @@ gnttab_update_pin_status(
     act = &rd->grant_table->active[ref];
     sha = &rd->grant_table->shared[ref];
 
+    spin_lock(&rd->grant_table->lock);
+
     if ( act->status == 0 )
     {
         if ( unlikely(pin_flags == 0) )
@@ -118,23 +131,17 @@ gnttab_update_pin_status(
 
             if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) ||
                  unlikely(sdom != ld->domain) )
-            {
-                DPRINTK("Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
+                PIN_FAIL(EINVAL,
+                         "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
                         sflags, sdom, ld->domain);
-                rc = -EINVAL;
-                goto out;
-            }
 
             sflags |= GTF_reading;
             if ( !(pin_flags & GNTPIN_readonly) )
             {
                 sflags |= GTF_writing;
                 if ( unlikely(sflags & GTF_readonly) )
-                {
-                    DPRINTK("Attempt to write-pin a read-only grant entry.\n");
-                    rc = -EINVAL;
-                    goto out;
-                }
+                    PIN_FAIL(EINVAL,
+                             "Attempt to write-pin a r/o grant entry.\n");
             }
 
             /* Merge two 16-bit values into a 32-bit combined update. */
@@ -144,11 +151,8 @@ gnttab_update_pin_status(
             /* NB. prev_sflags is updated in place to seen value. */
             if ( unlikely(cmpxchg_user((u32 *)&sha->flags, prev_scombo, 
                                        prev_scombo | GTF_writing)) )
-            {
-                DPRINTK("Fault while modifying shared flags and domid.\n");
-                rc = -EINVAL;
-                goto out;
-            }
+                PIN_FAIL(EINVAL,
+                         "Fault while modifying shared flags and domid.\n");
 
             /* Did the combined update work (did we see what we expected?). */
             if ( prev_scombo == scombo )
@@ -161,10 +165,22 @@ gnttab_update_pin_status(
         }
 
         /* rmb(); */ /* not on x86 */
+        frame = sha->frame;
+        if ( unlikely(!pfn_is_ram(frame)) || 
+             unlikely(!((pin_flags & GNTPIN_readonly) ? 
+                        get_page(&frame_table[frame], rd) : 
+                        get_page_and_type(&frame_table[frame], rd, 
+                                          PGT_writable_page))) )
+        {
+            clear_bit(_GTF_writing, &sha->flags);
+            clear_bit(_GTF_reading, &sha->flags);
+            PIN_FAIL(EINVAL, 
+                     "Could not pin the granted frame!\n");
+        }
 
         act->status = pin_flags;
         act->domid  = sdom;
-        act->frame  = sha->frame;
+        act->frame  = frame;
 
         make_entry_mappable(rd->grant_table, act);
     }
@@ -174,11 +190,13 @@ gnttab_update_pin_status(
 
         if ( unlikely((act->status & 
                        (GNTPIN_wmap_mask|GNTPIN_rmap_mask)) != 0) )
-        {
-            DPRINTK("Attempt to deactivate a mapped g.e. (%x)\n", act->status);
-            rc = -EINVAL;
-            goto out;
-        }
+            PIN_FAIL(EINVAL,
+                     "Attempt to deactiv a mapped g.e. (%x)\n", act->status);
+
+        frame = act->frame;
+        if ( !(act->status & GNTPIN_readonly) )
+            put_page_type(&frame_table[frame]);
+        put_page(&frame_table[frame]);
 
         act->status = 0;
         make_entry_unmappable(rd->grant_table, act);
@@ -199,12 +217,9 @@ gnttab_update_pin_status(
              (unlikely((act->status & GNTPIN_wmap_mask) != 0) ||
               (((pin_flags & GNTPIN_host_accessible) == 0) &&
                unlikely((act->status & GNTPIN_rmap_mask) != 0))) )
-        {
-            DPRINTK("Attempt to reduce pinning of a mapped g.e. (%x,%x)\n",
+            PIN_FAIL(EINVAL,
+                     "Attempt to reduce pinning of a mapped g.e. (%x,%x)\n",
                     pin_flags, act->status);
-            rc = -EINVAL;
-            goto out;
-        }
 
         /* Check for changes to host accessibility. */
         if ( pin_flags & GNTPIN_host_accessible )
@@ -220,6 +235,7 @@ gnttab_update_pin_status(
         {
             if ( !(act->status & GNTPIN_readonly) )
             {
+                put_page_type(&frame_table[act->frame]);
                 check_tlb_flush(act);
                 clear_bit(_GTF_writing, &sha->flags);
             }
@@ -231,20 +247,19 @@ gnttab_update_pin_status(
                 prev_sflags = sflags;
 
                 if ( unlikely(prev_sflags & GTF_readonly) )
-                {
-                    DPRINTK("Attempt to write-pin a read-only grant entry.\n");
-                    rc = -EINVAL;
-                    goto out;
-                }
-                
+                    PIN_FAIL(EINVAL,
+                             "Attempt to write-pin a r/o grant entry.\n");
+
+                if ( unlikely(!get_page_type(&frame_table[act->frame],
+                                             PGT_writable_page)) )
+                    PIN_FAIL(EINVAL,
+                             "Attempt to write-pin a unwritable page.\n");
+
                 /* NB. prev_sflags is updated in place to seen value. */
                 if ( unlikely(cmpxchg_user(&sha->flags, prev_sflags, 
                                            prev_sflags | GTF_writing)) )
-                {
-                    DPRINTK("Fault while modifying shared flags.\n");
-                    rc = -EINVAL;
-                    goto out;
-                }
+                    PIN_FAIL(EINVAL,
+                             "Fault while modifying shared flags.\n");
             }
             while ( prev_sflags != sflags );
         }
@@ -261,6 +276,7 @@ gnttab_update_pin_status(
     (void)__put_user(act->frame, &uop->host_phys_addr);
 
  out:
+    spin_unlock(&rd->grant_table->lock);
     put_domain(rd);
     return rc;
 }
@@ -289,6 +305,20 @@ do_grant_table_op(
     return rc;
 }
 
+int
+gnttab_try_map(
+    struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly)
+{
+    return 0;
+}
+
+int
+gnttab_try_unmap(
+    struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly)
+{
+    return 0;
+}
+
 int 
 grant_table_create(
     struct domain *d)
@@ -318,6 +348,7 @@ grant_table_create(
     SHARE_PFN_WITH_DOMAIN(virt_to_page(t->shared), d);
 
     /* Okay, install the structure. */
+    wmb(); /* avoid races with lock-free access to d->grant_table */
     d->grant_table = t;
     return 0;
 
index 6c0775c9d2d5f1e16e6ec035e96fbeacf028977f..3e37bded7dc82e363577fec8d93b305006780afc 100644 (file)
@@ -296,9 +296,19 @@ void cmain(multiboot_info_t *mbi)
     xmem_cache_init();
     xmem_cache_sizes_init(max_page);
 
+    /*
+     * Create a domain-structure allocator. The SLAB_NO_REAP flag is essential!
+     * This is because in some situations a domain's reference count will be
+     * incremented by someone with no other handle on the structure -- this is 
+     * inherently racey because the struct could be freed by the time that the
+     * count is incremented. By specifying 'no-reap' we ensure that, worst
+     * case, they increment some other domain's count, rather than corrupting
+     * a random field in a random structure!
+     * See, for example, arch/x86/memory.c:get_page_from_l1e().
+     */
     domain_struct_cachep = xmem_cache_create(
         "domain_cache", sizeof(struct domain),
-        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+        0, SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL);
     if ( domain_struct_cachep == NULL )
         panic("No slab cache for task structs.");
 
index 52da9c042bdcf2a7a4da6388467f2377313aaf73..79b8df7452f5cba08e06d3f387c3d8878a1d9e38 100644 (file)
@@ -300,12 +300,21 @@ void init_xenheap_pages(unsigned long ps, unsigned long pe)
 unsigned long alloc_xenheap_pages(int order)
 {
     struct pfn_info *pg;
-    int attempts = 0;
+    int i, attempts = 0;
 
  retry:
     if ( unlikely((pg = alloc_heap_pages(MEMZONE_XEN, order)) == NULL) )
         goto no_memory;
+
     memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
+
+    for ( i = 0; i < (1 << order); i++ )
+    {
+        pg[i].u.inuse.count_info = PGC_always_set;
+        pg[i].u.inuse.domain     = NULL;
+        pg[i].u.inuse.type_info  = 0;
+    }
+
     return (unsigned long)page_to_virt(pg);
 
  no_memory:
@@ -343,7 +352,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order)
 {
     struct pfn_info *pg;
     unsigned long mask, flushed_mask, pfn_stamp, cpu_stamp;
-    int i;
+    int i, j;
 
     ASSERT(!in_irq());
 
@@ -353,19 +362,16 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order)
     flushed_mask = 0;
     for ( i = 0; i < (1 << order); i++ )
     {
-        pg[i].u.inuse.domain    = NULL;
-        pg[i].u.inuse.type_info = 0;
-
         if ( (mask = (pg[i].u.free.cpu_mask & ~flushed_mask)) != 0 )
         {
             pfn_stamp = pg[i].tlbflush_timestamp;
-            for ( i = 0; (mask != 0) && (i < smp_num_cpus); i++ )
+            for ( j = 0; (mask != 0) && (j < smp_num_cpus); j++ )
             {
-                if ( mask & (1<<i) )
+                if ( mask & (1<<j) )
                 {
-                    cpu_stamp = tlbflush_time[i];
+                    cpu_stamp = tlbflush_time[j];
                     if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
-                        mask &= ~(1<<i);
+                        mask &= ~(1<<j);
                 }
             }
             
@@ -376,6 +382,10 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order)
                 flushed_mask |= mask;
             }
         }
+
+        pg[i].u.inuse.count_info = PGC_always_set;
+        pg[i].u.inuse.domain     = NULL;
+        pg[i].u.inuse.type_info  = 0;
     }
 
     if ( d == NULL )
@@ -401,7 +411,7 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order)
     {
         pg[i].u.inuse.domain = d;
         wmb(); /* Domain pointer must be visible before updating refcnt. */
-        pg[i].u.inuse.count_info = PGC_allocated | 1;
+        pg[i].u.inuse.count_info |= PGC_allocated | 1;
         list_add_tail(&pg[i].list, &d->page_list);
     }
 
@@ -418,10 +428,13 @@ void free_domheap_pages(struct pfn_info *pg, int order)
     if ( unlikely(IS_XEN_HEAP_FRAME(pg)) )
     {
         spin_lock_recursive(&d->page_alloc_lock);
+
         for ( i = 0; i < (1 << order); i++ )
             list_del(&pg[i].list);
+
         d->xenheap_pages -= 1 << order;
         drop_dom_ref = (d->xenheap_pages == 0);
+
         spin_unlock_recursive(&d->page_alloc_lock);
     }
     else if ( likely(d != NULL) )
@@ -431,9 +444,8 @@ void free_domheap_pages(struct pfn_info *pg, int order)
 
         for ( i = 0; i < (1 << order); i++ )
         {
-            pg[i].tlbflush_timestamp = tlbflush_clock;
-            pg[i].u.inuse.count_info = 0;
-            pg[i].u.free.cpu_mask    = 1 << d->processor;
+            pg[i].tlbflush_timestamp  = tlbflush_clock;
+            pg[i].u.free.cpu_mask     = 1 << d->processor;
             list_del(&pg[i].list);
         }
 
index b64adaedba187058a311934d91352b66d773ed9e..f2ecf955e33ca0865b58b12c8f46d7de11451cd9 100644 (file)
@@ -2,11 +2,7 @@
 #define __ARCH_X86_ATOMIC__
 
 #include <xen/config.h>
-
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc..
- */
+#include <asm/system.h>
 
 #ifdef CONFIG_SMP
 #define LOCK "lock ; "
 #endif
 
 /*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
- * not some alias that contains the same information.
+ * NB. I've pushed the volatile qualifier into the operations. This allows
+ * fast accessors such as _atomic_read() and _atomic_set() which don't give
+ * the compiler a fit.
  */
-typedef struct { volatile int counter; } atomic_t;
+typedef struct { int counter; } atomic_t;
 
 #define ATOMIC_INIT(i) { (i) }
 
@@ -29,8 +25,9 @@ typedef struct { volatile int counter; } atomic_t;
  * 
  * Atomically reads the value of @v.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
- */ 
-#define atomic_read(v)         ((v)->counter)
+ */
+#define _atomic_read(v)                ((v).counter)
+#define atomic_read(v)         (*(volatile int *)&((v)->counter))
 
 /**
  * atomic_set - set atomic variable
@@ -40,7 +37,8 @@ typedef struct { volatile int counter; } atomic_t;
  * Atomically sets the value of @v to @i.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
  */ 
-#define atomic_set(v,i)                (((v)->counter) = (i))
+#define _atomic_set(v,i)       (((v).counter) = (i))
+#define atomic_set(v,i)                (*(volatile int *)&((v)->counter) = (i))
 
 /**
  * atomic_add - add integer to atomic variable
@@ -54,8 +52,8 @@ static __inline__ void atomic_add(int i, atomic_t *v)
 {
        __asm__ __volatile__(
                LOCK "addl %1,%0"
-               :"=m" (v->counter)
-               :"ir" (i), "m" (v->counter));
+               :"=m" (*(volatile int *)&v->counter)
+               :"ir" (i), "m" (*(volatile int *)&v->counter));
 }
 
 /**
@@ -70,8 +68,8 @@ static __inline__ void atomic_sub(int i, atomic_t *v)
 {
        __asm__ __volatile__(
                LOCK "subl %1,%0"
-               :"=m" (v->counter)
-               :"ir" (i), "m" (v->counter));
+               :"=m" (*(volatile int *)&v->counter)
+               :"ir" (i), "m" (*(volatile int *)&v->counter));
 }
 
 /**
@@ -90,8 +88,8 @@ static __inline__ int atomic_sub_and_test(int i, atomic_t *v)
 
        __asm__ __volatile__(
                LOCK "subl %2,%0; sete %1"
-               :"=m" (v->counter), "=qm" (c)
-               :"ir" (i), "m" (v->counter) : "memory");
+               :"=m" (*(volatile int *)&v->counter), "=qm" (c)
+               :"ir" (i), "m" (*(volatile int *)&v->counter) : "memory");
        return c;
 }
 
@@ -106,8 +104,8 @@ static __inline__ void atomic_inc(atomic_t *v)
 {
        __asm__ __volatile__(
                LOCK "incl %0"
-               :"=m" (v->counter)
-               :"m" (v->counter));
+               :"=m" (*(volatile int *)&v->counter)
+               :"m" (*(volatile int *)&v->counter));
 }
 
 /**
@@ -121,8 +119,8 @@ static __inline__ void atomic_dec(atomic_t *v)
 {
        __asm__ __volatile__(
                LOCK "decl %0"
-               :"=m" (v->counter)
-               :"m" (v->counter));
+               :"=m" (*(volatile int *)&v->counter)
+               :"m" (*(volatile int *)&v->counter));
 }
 
 /**
@@ -140,8 +138,8 @@ static __inline__ int atomic_dec_and_test(atomic_t *v)
 
        __asm__ __volatile__(
                LOCK "decl %0; sete %1"
-               :"=m" (v->counter), "=qm" (c)
-               :"m" (v->counter) : "memory");
+               :"=m" (*(volatile int *)&v->counter), "=qm" (c)
+               :"m" (*(volatile int *)&v->counter) : "memory");
        return c != 0;
 }
 
@@ -160,8 +158,8 @@ static __inline__ int atomic_inc_and_test(atomic_t *v)
 
        __asm__ __volatile__(
                LOCK "incl %0; sete %1"
-               :"=m" (v->counter), "=qm" (c)
-               :"m" (v->counter) : "memory");
+               :"=m" (*(volatile int *)&v->counter), "=qm" (c)
+               :"m" (*(volatile int *)&v->counter) : "memory");
        return c != 0;
 }
 
@@ -181,11 +179,20 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v)
 
        __asm__ __volatile__(
                LOCK "addl %2,%0; sets %1"
-               :"=m" (v->counter), "=qm" (c)
-               :"ir" (i), "m" (v->counter) : "memory");
+               :"=m" (*(volatile int *)&v->counter), "=qm" (c)
+               :"ir" (i), "m" (*(volatile int *)&v->counter) : "memory");
        return c;
 }
 
+static __inline__ atomic_t atomic_compareandswap(
+       atomic_t old, atomic_t new, atomic_t *v)
+{
+       atomic_t rc;
+       rc.counter = 
+               __cmpxchg(&v->counter, old.counter, new.counter, sizeof(int));
+       return rc;
+}
+
 /* Atomic operations are already serializing on x86 */
 #define smp_mb__before_atomic_dec()    barrier()
 #define smp_mb__after_atomic_dec()     barrier()
index 9a26e29f080b1c8d0b07bd6f50a61086a8b98a9c..c07235fa5e70c5cf1102315f5cf7243ce45da210 100644 (file)
@@ -87,9 +87,11 @@ struct pfn_info
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated                29
 #define PGC_allocated                 (1<<_PGC_allocated)
- /* 28-bit count of references to this frame. */
-#define PGC_count_mask                ((1<<29)-1)
-
+ /* This bit is always set, guaranteeing that the count word is never zero. */
+#define _PGC_always_set               28
+#define PGC_always_set                (1<<_PGC_always_set)
+ /* 27-bit count of references to this frame. */
+#define PGC_count_mask                ((1<<28)-1)
 
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)         (1)
@@ -106,7 +108,8 @@ struct pfn_info
         wmb(); /* install valid domain ptr before updating refcnt. */       \
         spin_lock(&(_dom)->page_alloc_lock);                                \
         /* _dom holds an allocation reference */                            \
-        (_pfn)->u.inuse.count_info = PGC_allocated | 1;                     \
+        ASSERT((_pfn)->u.inuse.count_info == PGC_always_set);               \
+        (_pfn)->u.inuse.count_info |= PGC_allocated | 1;                    \
         if ( unlikely((_dom)->xenheap_pages++ == 0) )                       \
             get_knownalive_domain(_dom);                                    \
         list_add_tail(&(_pfn)->list, &(_dom)->xenpage_list);                \
@@ -150,10 +153,8 @@ static inline int get_page(struct pfn_info *page,
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(p != domain) )                 /* Wrong owner? */
         {
-            DPRINTK("Error pfn %08lx: ed=%p(%u), sd=%p(%u),"
-                    " caf=%08x, taf=%08x\n",
-                    page_to_pfn(page), domain, domain->domain,
-                    p, (p && !((x & PGC_count_mask) == 0))?p->domain:999, 
+            DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
+                    page_to_pfn(page), domain, p,
                     x, page->u.inuse.type_info);
             return 0;
         }
@@ -364,26 +365,21 @@ void ptwr_reconnect_disconnected(unsigned long addr);
 void ptwr_flush_inactive(void);
 int ptwr_do_page_fault(unsigned long);
 
-static always_inline void 
-__cleanup_writable_pagetable(
-    const int what)
-{
-    int cpu = smp_processor_id();
-
-    if (what & PTWR_CLEANUP_ACTIVE)
-        if (ptwr_info[cpu].disconnected != ENTRIES_PER_L2_PAGETABLE)
-            ptwr_reconnect_disconnected(0L);
-    if (what & PTWR_CLEANUP_INACTIVE)
-        if (ptwr_info[cpu].writable_idx)
-            ptwr_flush_inactive();
-}
-
-static always_inline void
-cleanup_writable_pagetable(
-    struct domain *d, const int what)
-{
-    if ( unlikely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
-        __cleanup_writable_pagetable(what);
-}
+#define __cleanup_writable_pagetable(_what)                               \
+do {                                                                      \
+    int cpu = smp_processor_id();                                         \
+    if ((_what) & PTWR_CLEANUP_ACTIVE)                                    \
+        if (ptwr_info[cpu].disconnected != ENTRIES_PER_L2_PAGETABLE)      \
+            ptwr_reconnect_disconnected(0L);                              \
+    if ((_what) & PTWR_CLEANUP_INACTIVE)                                  \
+        if (ptwr_info[cpu].writable_idx)                                  \
+            ptwr_flush_inactive();                                        \
+} while ( 0 )
+
+#define cleanup_writable_pagetable(_d, _w)                                \
+    do {                                                                  \
+        if ( unlikely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) ) \
+        __cleanup_writable_pagetable(_w);                                 \
+    } while ( 0 )
 
 #endif /* __ASM_X86_MM_H__ */
index 25c29de2e8018b3f35ead6c743f4de624036f78f..b4d79087c52364c731797798f933802ec6e47a9b 100644 (file)
@@ -1,26 +1,13 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
 #ifndef __ASSEMBLY__
 #include <xen/config.h>
-/*#include <xen/threads.h>*/
-#include <asm/ptrace.h>
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#ifndef __ASSEMBLY__
 #include <asm/fixmap.h>
-#include <asm/bitops.h>
 #include <asm/mpspec.h>
-#ifdef CONFIG_X86_IO_APIC
 #include <asm/io_apic.h>
-#endif
 #include <asm/apic.h>
 #endif
-#endif
 
 #ifdef CONFIG_SMP
 #ifndef __ASSEMBLY__
@@ -37,12 +24,6 @@ extern int pic_mode;
 extern int smp_num_siblings;
 extern int cpu_sibling_map[];
 
-extern void smp_flush_tlb(void);
-extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
-extern void smp_send_reschedule(int cpu);
-extern void smp_invalidate_rcv(void);          /* Process an NMI */
-extern void (*mtrr_hook) (void);
-
 /*
  * On x86 all CPUs are mapped 1:1 to the APIC space.
  * This simplifies scheduling and IPI sending and
index 4b25eec921e6e1592e2681e736d531f449ad7f60..4835b6e236fc49b22e5807daa056185b521e6e0e 100644 (file)
@@ -30,33 +30,33 @@ static always_inline unsigned long __xchg(unsigned long x, volatile void * ptr,
                case 1:
                        __asm__ __volatile__("xchgb %b0,%1"
                                :"=q" (x)
-                               :"m" (*__xg(ptr)), "0" (x)
+                               :"m" (*__xg((volatile void *)ptr)), "0" (x)
                                :"memory");
                        break;
                case 2:
                        __asm__ __volatile__("xchgw %w0,%1"
                                :"=r" (x)
-                               :"m" (*__xg(ptr)), "0" (x)
+                               :"m" (*__xg((volatile void *)ptr)), "0" (x)
                                :"memory");
                        break;
 #if defined(__i386__)
                case 4:
                        __asm__ __volatile__("xchgl %0,%1"
                                :"=r" (x)
-                               :"m" (*__xg(ptr)), "0" (x)
+                               :"m" (*__xg((volatile void *)ptr)), "0" (x)
                                :"memory");
                        break;
 #elif defined(__x86_64__)
                case 4:
                        __asm__ __volatile__("xchgl %k0,%1"
                                :"=r" (x)
-                               :"m" (*__xg(ptr)), "0" (x)
+                               :"m" (*__xg((volatile void *)ptr)), "0" (x)
                                :"memory");
                        break;
                case 8:
                        __asm__ __volatile__("xchgq %0,%1"
                                :"=r" (x)
-                               :"m" (*__xg(ptr)), "0" (x)
+                               :"m" (*__xg((volatile void *)ptr)), "0" (x)
                                :"memory");
                        break;
 #endif
@@ -78,33 +78,33 @@ static always_inline unsigned long __cmpxchg(volatile void *ptr, unsigned long o
        case 1:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
                                     : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "q"(new), "m"(*__xg((volatile void *)ptr)), "0"(old)
                                     : "memory");
                return prev;
        case 2:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
                                     : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old)
                                     : "memory");
                return prev;
 #if defined(__i386__)
        case 4:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
                                     : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old)
                                     : "memory");
                return prev;
 #elif defined(__x86_64__)
        case 4:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
                                     : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old)
                                     : "memory");
                return prev;
        case 8:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
                                     : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "r"(new), "m"(*__xg((volatile void *)ptr)), "0"(old)
                                     : "memory");
                return prev;
 #endif
index 14214864100e0ef37c04f6534dca0a0ceaed35fc..395959323ce60433692f6a38caff8f2ca1373251 100644 (file)
@@ -24,6 +24,8 @@
 #ifndef __XEN_GRANT_H__
 #define __XEN_GRANT_H__
 
+#include <xen/config.h>
+#include <xen/mm.h>
 #include <hypervisor-ifs/grant_table.h>
 
 /* Active grant entry - used for shadowing GTF_permit_access grants. */
@@ -65,10 +67,19 @@ typedef struct {
 } grant_table_t;
 
 /* Start-of-day system initialisation. */
-void grant_table_init(void);
+void grant_table_init(
+    void);
 
 /* Create/destroy per-domain grant table context. */
-int  grant_table_create(struct domain *d);
-void grant_table_destroy(struct domain *d);
+int grant_table_create(
+    struct domain *d);
+void grant_table_destroy(
+    struct domain *d);
+
+/* Create/destroy host-CPU mappings via a grant-table entry. */
+int gnttab_try_map(
+    struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly);
+int gnttab_try_unmap(
+    struct domain *rd, struct domain *ld, struct pfn_info *page, int readonly);
 
 #endif /* __XEN_GRANT_H__ */
index 3c72f6de6a17df140e09a992d74436398ac664fa..51fb070673422553d40a7fd03790539d65ce477b 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef __SCHED_H__
 #define __SCHED_H__
 
+#define STACK_SIZE (2*PAGE_SIZE)
+#define MAX_DOMAIN_NAME 16
+
 #include <xen/config.h>
 #include <xen/types.h>
 #include <xen/spinlock.h>
 #include <asm/processor.h>
 #include <hypervisor-ifs/hypervisor-if.h>
 #include <hypervisor-ifs/dom0_ops.h>
-#include <xen/grant_table.h>
 #include <xen/list.h>
 #include <xen/time.h>
 #include <xen/ac_timer.h>
 #include <xen/delay.h>
 #include <asm/atomic.h>
-
-#define STACK_SIZE (2*PAGE_SIZE)
 #include <asm/current.h>
-
-#define MAX_DOMAIN_NAME 16
+#include <xen/spinlock.h>
+#include <xen/grant_table.h>
 
 extern unsigned long volatile jiffies;
 extern rwlock_t tasklist_lock;
 
-#include <xen/spinlock.h>
-
 struct domain;
 
 typedef struct event_channel_st
@@ -167,10 +165,19 @@ struct domain *alloc_domain_struct();
  * Use this when you don't have an existing reference to @d. It returns
  * FALSE if @d is being destructed.
  */
-static inline int get_domain(struct domain *d)
+static always_inline int get_domain(struct domain *d)
 {
-    atomic_inc(&d->refcnt);
-    return !(atomic_read(&d->refcnt) & DOMAIN_DESTRUCTED);
+    atomic_t old, new, seen = d->refcnt;
+    do
+    {
+        old = seen;
+        if ( unlikely(_atomic_read(old) & DOMAIN_DESTRUCTED) )
+            return 0;
+        _atomic_set(new, _atomic_read(old) + 1);
+        seen = atomic_compareandswap(old, new, &d->refcnt);
+    }
+    while ( unlikely(_atomic_read(seen) != _atomic_read(old)) );
+    return 1;
 }
 
 /*